Loading packages:
library(Seurat)
library(tidyverse)
library(gprofiler2)
library(sleepwalk)
library(SCINA)
library(gridExtra)
theme_set(theme_bw(base_size = 14))#Loading data
setwd("/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon_2024/Processed data to be loaded in R")
data_dir_GoT <- '/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon_2024/Processed data to be loaded in R/AML1022.1_GoT'
data_dir_nanoranger <- '/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon_2024/Processed data to be loaded in R/AML1022.1_nanoranger'
AML_GoT <- Read10X(
data_dir_GoT,
gene.column = 2,
cell.column = 1,
unique.features = TRUE,
strip.suffix = FALSE
)
AML_nanoranger <- Read10X(
data_dir_nanoranger,
gene.column = 2,
cell.column = 1,
unique.features = TRUE,
strip.suffix = FALSE
)seurat_object_AML_GoT = CreateSeuratObject(AML_GoT)
seurat_object_AML_nanoranger = CreateSeuratObject(AML_nanoranger)seurat_object_AML_GoT$orig.ident = 'AML_GoT'
seurat_object_AML_nanoranger$orig.ident = 'AML_nanoranger'
AML_merged <- merge(seurat_object_AML_GoT, y = seurat_object_AML_nanoranger, project = "AML_merged")## Warning in CheckDuplicateCellNames(object.list = objects): Some cell names are
## duplicated across objects provided. Renaming to enforce unique cell names.
as_tibble(AML_merged@meta.data) %>%
group_by(orig.ident) %>%
summarise(initial_cells=n()) -> cell_counts
cell_countsPercentageFeatureSet(AML_merged,pattern="^MT-") -> AML_merged$percent_mt
PercentageFeatureSet(AML_merged,pattern="MALAT1") -> AML_merged$percent_Malat
apply(AML_merged@assays$RNA@counts, 2, function(x) max((100*x)/sum(x))) -> AML_merged$percent_Largest
AML_merged[[]]as_tibble(AML_merged@meta.data) -> qc_metrics
head(qc_metrics)qc_metrics %>%
pivot_longer(cols=-orig.ident, names_to="metric",values_to="value") %>%
ggplot(aes(x=orig.ident, y=value)) +
geom_violin(fill="lightskyblue")+
facet_grid(rows=vars(metric), scales="free_y")ggsave(paste0("Violin plot linear scale no intercept "
, format(Sys.time(), "%Y-%m-%d")
, ".pdf"), width = 8, height = 10, dpi = 300)
qc_metrics %>%
pivot_longer(cols=-orig.ident, names_to="metric",values_to="value") %>%
ggplot(aes(x=orig.ident, y=log(value))) +
geom_violin(fill="lightskyblue")+
facet_grid(rows=vars(metric), scales="free_y")ggsave(paste0("Violin plot log scale no intercept "
, format(Sys.time(), "%Y-%m-%d")
, ".pdf"), width = 8, height = 10, dpi = 300)setwd("/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon_2024")
NormalizeData(AML_merged, normalization.method = "CLR") -> AML_merged
saveRDS(AML_merged, file="AML_merged_norm_latest.RDS")setwd("/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon_2024")
subset <- AML_merged@assays$RNA@data[,c(rep(F,69),T)] %>%
as_tibble() %>%
pivot_longer(
cols=everything(),
names_to="cell",
values_to="value"
) %>%
left_join(
as_tibble(AML_merged@meta.data, rownames="cell") %>%
dplyr::select(cell,orig.ident)
)
subset %>%
ggplot(aes(x=value, colour=orig.ident, group=cell)) +
geom_density(linewidth=0.25)+
coord_cartesian(xlim=c(0,3), ylim=c(0,1)) +
scale_colour_brewer(palette = "Set1")ggsave(paste0("Post norm check all samples together "
, format(Sys.time(), "%Y-%m-%d")
, ".pdf"), width = 8, height = 5, dpi = 300)
subset %>%
ggplot(aes(x=value, colour=orig.ident, group=cell)) +
geom_density(linewidth=0.25)+
coord_cartesian(xlim=c(0,3), ylim=c(0,1)) +
scale_colour_brewer(palette = "Set1") +
facet_wrap(vars(orig.ident))ggsave(paste0("Post norm check 4 samples separate "
, format(Sys.time(), "%Y-%m-%d")
, ".pdf"), width = 8, height = 5, dpi = 300)FindVariableFeatures(
AML_merged,
selection.method = "vst",
nfeatures = 500
) -> AML_merged
as_tibble(HVFInfo(AML_merged),rownames = "Gene") -> variance.data
variance.data %>%
mutate(hypervariable=Gene %in% VariableFeatures(AML_merged)
) -> variance.data
variance.data %>%
arrange(desc(variance.standardized)) -> variance.data
head(variance.data,n=100)setwd("/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon_2024")
variance.data %>%
arrange(hypervariable) %>%
ggplot(aes(x=mean,y=variance, colour=hypervariable)) +
geom_point() +
scale_y_log10() +
scale_x_log10() +
scale_colour_manual(values=c("grey","red2"))## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
ggsave(paste0("HVG gene plot "
, format(Sys.time(), "%Y-%m-%d")
, ".pdf"), width = 8, height = 5, dpi = 300)## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous x-axis
#Extract gene list for GO analysis
#https://www.bioinformatics.babraham.ac.uk/goliath/goliath.cgi
variance.data %>%
arrange(desc(variance.standardized)) %>%
slice(1:100) %>%
pull(Gene) %>%
sapply(function(x)cat(paste0(x,"\n"))) -> temp## CA1
## DLK1
## AHSP
## HBB
## PRTN3
## HBD
## HBM
## PPBP
## MPO
## HBG2
## ALAS2
## CA2
## IGLV1-44
## HBA2
## TRBV7-2
## PF4
## GNLY
## HBA1
## APOC1
## THBS1
## SLC4A1
## HIST1H4C
## PRDX2
## IGHV4-34
## AZU1
## SLC25A37
## HIST1H1B
## MS4A2
## TRBV5-1
## CLEC1B
## IFI27
## GP9
## TRBV7-3
## PRSS2
## TRBV7-9
## CCL4
## SNCA
## TRBV3-1
## ZNF385D
## TRBV10-3
## TRBV12-3
## TRBV11-2
## GDF15
## IFIT1B
## CTSG
## XACT
## GAL
## TRBV24-1
## CD1E
## TRBV6-2
## TRBV6-5
## PTGDS
## HDC
## CD9
## TRBV6-6
## JCHAIN
## TRBV9
## TRBV18
## TSPO2
## TRBV28
## TRBV30
## XCL1
## ITGB3
## TRBV13
## HBG1
## LTBP1
## TRBV14
## FCER1A
## TUBB1
## TRBV4-1
## TRBV7-6
## TRBV5-4
## RRM2
## TRBV29-1
## TRBV2
## MKI67
## GZMB
## IGHA1
## TRBV5-6
## ITGA2B
## TRDV2
## HIST1H3B
## ACY3
## ANK1
## MPIG6B
## GYPB
## TRBV4-2
## TRBV19
## LTB
## IL32
## DNASE1L3
## TRAV21
## IFIT2
## TRAV13-1
## ESAM
## IGLC2
## PRF1
## TRAV38-2DV8
## TRBV6-1
## KIAA1671
setwd("/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon_2024")
ScaleData(AML_merged, features=rownames(data)) -> AML_merged## Centering and scaling data matrix
RunPCA(
AML_merged,
features=VariableFeatures(AML_merged)
) -> AML_merged## PC_ 1
## Positive: TUBB, HMGA1, STMN1, PCLAF, TYMS, MIF, TUBA1B, DUT, MCM7, NME1
## HIST1H4C, MPC2, H2AFZ, PRSS57, CENPU, IGLL1, GATA1, FABP5, KLF1, PCNA
## PDLIM1, FEN1, MIR924HG, MYC, MINPP1, MRPL16, DTL, KCNH2, REXO2, PKMYT1
## Negative: S100A9, TYROBP, S100A8, VCAN, FCN1, IFI30, CD14, SERPINA1, MNDA, NCF1
## LYZ, TMEM176B, CEBPD, PLXDC2, LGALS2, S100A12, TMEM176A, CST3, FPR1, CFD
## RETN, S100P, IL1R2, PID1, PLAUR, JUN, CDKN1A, TIMP1, MAFB, CCL3
## PC_ 2
## Positive: IL32, CD3E, CD247, CD7, GZMA, LTB, IL7R, PRF1, CD69, KLRB1
## GNLY, CST7, GZMB, KLRD1, GZMH, FGFBP2, CLIC3, PRSS2, CD27, CD8A
## GZMK, CCL5, FCGR3A, IGHM, NKG7, NKAIN2, AQP3, SPON2, KLRC1, CD8B
## Negative: LYZ, IFI30, CST3, FCN1, VCAN, S100A9, S100A8, SERPINA1, MNDA, CD14
## CEBPD, TYROBP, CFD, TMEM176B, LGALS2, NCF1, BLVRB, TIMP1, PLXDC2, TMEM176A
## S100A12, CDKN1A, RETN, CD36, FPR1, PLAUR, S100P, NRGN, IL1R2, TEX14
## PC_ 3
## Positive: C1QTNF4, STMN1, MIF, HLA-DPB1, PRSS57, HLA-DPA1, NKAIN2, FABP5, IGHM, IGLL1
## NME1, H2AFZ, CD38, AL589693.1, MZB1, HMGA1, TUBA1B, DUT, MRPL16, TUBB
## HLA-DQA1, HLA-DQA2, WDR49, MIR924HG, CD69, AC002454.1, SLC24A3, MPO, MCM7, TXNDC5
## Negative: ALAS2, HBA1, AHSP, HBA2, CA1, HBB, HBM, SELENBP1, SLC25A37, SLC4A1
## SNCA, EPB42, GYPB, HBD, CA2, HBQ1, FECH, SLC25A39, GMPR, PRDX2
## PHOSPHO1, IFI27, TSPO2, ANK1, TNS1, MYL4, RHD, SMIM1, IFIT1B, DCAF12
## PC_ 4
## Positive: C1QTNF4, NKAIN2, IGHM, PRSS2, SCN3A, H1F0, AL589693.1, MZB1, HEMGN, JCHAIN
## ACY3, HLA-DPB1, GNG11, TRIM58, HLA-DPA1, COL24A1, HBA2, WDR49, PRSS57, MMRN1
## ALAS2, HBA1, SLC24A3, SNCA, HBB, STMN1, HLA-DQA1, HLA-DQA2, SLC25A37, ADGRB3
## Negative: CST7, GZMA, GNLY, PRF1, CD247, CD7, GZMB, NKG7, KLRD1, CCL5
## FGFBP2, FCGR3A, GZMH, KLRB1, CLIC3, CCL4, IL32, KLRC1, SPON2, CD3E
## XCL2, LAT, JUN, DUSP2, CCL4L2, ZNF331, GZMK, CRIP1, CD8A, IL7R
## PC_ 5
## Positive: ZNF385D, GATA1, CSF2RB, SEMA3C, MYO16, CYP7B1, GATA2, TRIB2, SLC40A1, CSF1
## ANGPT2, PDLIM1, KCNH2, LINC02284, FCER1A, GP9, HDC, TAFA2, KLF1, RBPMS2
## MS4A2, MS4A4E, SLC24A3, AC244502.1, PRKAR2B, SMIM1, ANKRD9, DLK1, MPIG6B, PF4
## Negative: C1QTNF4, AL589693.1, IGHM, MZB1, HLA-DPA1, HLA-DPB1, WDR49, MPO, NKAIN2, SCN3A
## NKG7, CTSG, DDIT4, HLA-DQA1, ACY3, HLA-DQA2, JCHAIN, H1F0, SLC25A39, AZU1
## TUBA1B, ID3, CD69, PRSS2, ID1, AHSP, ALAS2, HEMGN, DUSP2, MIR924HG
PCAPlot(AML_merged, dims=c(1,2), group.by= 'orig.ident')ggsave(paste0("PCA dim 1&2 "
, format(Sys.time(), "%Y-%m-%d")
, ".pdf"), width = 8, height = 6, dpi = 300)
PCAPlot(AML_merged, dims=c(3,4), group.by= 'orig.ident')ggsave(paste0("PCA dim 3&4 "
, format(Sys.time(), "%Y-%m-%d")
, ".pdf"), width = 8, height = 6, dpi = 300)
PCAPlot(AML_merged, dims=c(5,6), group.by= 'orig.ident')ggsave(paste0("PCA dim 5&6 "
, format(Sys.time(), "%Y-%m-%d")
, ".pdf"), width = 8, height = 6, dpi = 300)
PCAPlot(AML_merged, dims=c(7,8), group.by= 'orig.ident')ggsave(paste0("PCA dim 7&8 "
, format(Sys.time(), "%Y-%m-%d")
, ".pdf"), width = 8, height = 6, dpi = 300)
PCAPlot(AML_merged, dims=c(9,10), group.by= 'orig.ident')ggsave(paste0("PCA dim 9&10 "
, format(Sys.time(), "%Y-%m-%d")
, ".pdf"), width = 8, height = 6, dpi = 300)
ElbowPlot(AML_merged, ndims=40)ggsave(paste0("PCA SD vs dim "
, format(Sys.time(), "%Y-%m-%d")
, ".pdf"), width = 8, height = 5, dpi = 300)Idents(AML_merged) <- 'orig.ident'setwd("/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon_2024")
saved.seed <- 5540
set.seed(saved.seed)
RunUMAP(
AML_merged,
dims=1:30,
n.neighbors = 50,
min.dist = 1,
seed.use = saved.seed
) -> AML_merged## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 14:39:13 UMAP embedding parameters a = 0.115 b = 1.929
## 14:39:13 Read 10119 rows and found 30 numeric columns
## 14:39:13 Using Annoy for neighbor search, n_neighbors = 50
## 14:39:13 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 14:39:15 Writing NN index file to temp file /var/folders/rx/vvf3pzf55v912fz97g52nz3m0000gn/T//RtmpwV81Hi/file13853dcb38ce
## 14:39:15 Searching Annoy index using 1 thread, search_k = 5000
## 14:39:21 Annoy recall = 100%
## 14:39:23 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 50
## 14:39:24 Initializing from normalized Laplacian + noise (using irlba)
## 14:39:25 Commencing optimization for 200 epochs, with 697754 positive edges
## 14:39:36 Optimization finished
DimPlot(AML_merged,reduction = "umap", pt.size = 0.3)ggsave(paste0("UMAP all samples "
, format(Sys.time(), "%Y-%m-%d")
, ".pdf"), width = 8, height = 8, dpi = 300)
DimPlot(AML_merged,reduction = "umap", pt.size = 0.3, split.by = "orig.ident", ncol = 2)ggsave(paste0("UMAP 3 samples separate "
, format(Sys.time(), "%Y-%m-%d")
, ".pdf"), width = 12, height = 10, dpi = 300)FindNeighbors(AML_merged, reduction = "pca", dims = 1:30) -> AML_merged## Computing nearest neighbor graph
## Computing SNN
AML_merged@graphs$RNA_snn[1:10,1:10]## 10 x 10 sparse Matrix of class "dgCMatrix"
## [[ suppressing 10 column names 'AAACCTGAGAATTGTG-1_1', 'AAACCTGAGCTGATAA-1_1', 'AAACCTGAGGCAATTA-1_1' ... ]]
##
## AAACCTGAGAATTGTG-1_1 1 . . . . . . . . .
## AAACCTGAGCTGATAA-1_1 . 1 . . . . . . . .
## AAACCTGAGGCAATTA-1_1 . . 1 . . . . . . .
## AAACCTGAGGTGCAAC-1_1 . . . 1 . . . . . .
## AAACCTGAGTGCAAGC-1_1 . . . . 1 . . . . .
## AAACCTGAGTGTACGG-1_1 . . . . . 1 . . . .
## AAACCTGCATCGGTTA-1_1 . . . . . . 1 . . .
## AAACCTGGTACCGCTG-1_1 . . . . . . . 1 . .
## AAACCTGGTCTCCATC-1_1 . . . . . . . . 1 .
## AAACCTGGTTCACGGC-1_1 . . . . . . . . . 1
FindClusters(AML_merged, resolution = 0.1) -> AML_merged## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 10119
## Number of edges: 379815
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9661
## Number of communities: 8
## Elapsed time: 1 seconds
head(AML_merged$seurat_clusters, n=50)## AAACCTGAGAATTGTG-1_1 AAACCTGAGCTGATAA-1_1 AAACCTGAGGCAATTA-1_1
## 7 1 1
## AAACCTGAGGTGCAAC-1_1 AAACCTGAGTGCAAGC-1_1 AAACCTGAGTGTACGG-1_1
## 2 0 2
## AAACCTGCATCGGTTA-1_1 AAACCTGGTACCGCTG-1_1 AAACCTGGTCTCCATC-1_1
## 1 0 0
## AAACCTGGTTCACGGC-1_1 AAACCTGTCACGCATA-1_1 AAACGGGAGAGCTTCT-1_1
## 3 1 4
## AAACGGGAGAGTTGGC-1_1 AAACGGGAGCGTAATA-1_1 AAACGGGAGGCTATCT-1_1
## 1 3 2
## AAACGGGGTACCGTTA-1_1 AAACGGGGTCACAAGG-1_1 AAACGGGGTCTCTTTA-1_1
## 0 3 2
## AAACGGGGTGTGACGA-1_1 AAACGGGTCACATAGC-1_1 AAACGGGTCAGCAACT-1_1
## 4 0 0
## AAACGGGTCCTGTAGA-1_1 AAACGGGTCGATAGAA-1_1 AAAGATGAGACTTGAA-1_1
## 3 0 5
## AAAGATGAGAGACTTA-1_1 AAAGATGAGCGTTTAC-1_1 AAAGATGAGGAATCGC-1_1
## 0 0 0
## AAAGATGAGGACGAAA-1_1 AAAGATGAGTTACCCA-1_1 AAAGATGCACCCATGG-1_1
## 0 4 0
## AAAGATGGTGATAAGT-1_1 AAAGATGGTGATGTGG-1_1 AAAGATGGTGGTGTAG-1_1
## 1 1 1
## AAAGATGGTGTTCGAT-1_1 AAAGATGTCTCGATGA-1_1 AAAGCAAAGACGCACA-1_1
## 3 0 3
## AAAGCAAAGATAGGAG-1_1 AAAGCAAAGGAGCGTT-1_1 AAAGCAAAGTAAGTAC-1_1
## 7 0 1
## AAAGCAAAGTCCATAC-1_1 AAAGCAACACAGACAG-1_1 AAAGCAACATTCCTCG-1_1
## 0 3 3
## AAAGCAAGTCATATCG-1_1 AAAGCAAGTGTGTGCC-1_1 AAAGCAATCAATCACG-1_1
## 1 2 1
## AAAGCAATCACAACGT-1_1 AAAGCAATCTACTTAC-1_1 AAAGTAGAGCGTCTAT-1_1
## 1 2 5
## AAAGTAGAGGGTTCCC-1_1 AAAGTAGCAATCCGAT-1_1
## 1 3
## Levels: 0 1 2 3 4 5 6 7
setwd("/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon_2024")
DimPlot(AML_merged,reduction = "umap", pt.size = 0.3, label = TRUE, label.size = 8)ggsave(paste0("UMAP 2 samples with SNN clusters resolution 0.1 "
, format(Sys.time(), "%Y-%m-%d")
, ".pdf"), width = 8, height = 8, dpi = 300)
DimPlot(AML_merged,reduction = "umap", pt.size = 0.3, label = TRUE, label.size = 4, split.by = "orig.ident", ncol = 2) ggsave(paste0("UMAP 2 samples separate with SNN clusters resolution 0.1 "
, format(Sys.time(), "%Y-%m-%d")
, ".pdf"), width = 12, height = 10, dpi = 300)